Note:

Throughout this document, any season column represents the year each season started. For example, the 2015-16 season will be in the dataset as 2015. OKC and DEN 2024-25 schedules intentionally include only 80 games, as the league holds 2 games out for each team in the middle of December due to unknown NBA Cup matchups. Specific games are not assigned to fill those two slots.

Setup and Data

library(tidyverse)
library(ggplot2)
# Note, you will likely have to change these paths. If your data is in the same folder as this project, 
# the paths will likely be fixed for you by deleting ../../Data/schedule_project/ from each string.
schedule <- read_csv("Data/schedule_project/schedule.csv")
draft_schedule <- read_csv("Data/schedule_project/schedule_24_partial.csv")
locations <- read_csv("Data/schedule_project/locations.csv")
game_data <- read_csv("Data/schedule_project/team_game_data.csv")

Part 1 – Schedule Analysis

# Filter for OKC games only
okc_schedule <- draft_schedule %>%
  filter(team == "OKC") %>%
  mutate(gamedate = ymd(gamedate)) %>% # Convert to Date object 
  arrange(gamedate) %>% # Arrange starting from October for mental clarity
  mutate(game_number = row_number()) # Sequential counter for games

head(okc_schedule)
## # A tibble: 6 × 7
##   season gamedate   team  opponent  home   win game_number
##    <dbl> <date>     <chr> <chr>    <dbl> <dbl>       <int>
## 1   2024 2024-10-24 OKC   DEN          0     1           1
## 2   2024 2024-10-26 OKC   CHI          0     1           2
## 3   2024 2024-10-27 OKC   ATL          1     1           3
## 4   2024 2024-10-30 OKC   SAS          1     1           4
## 5   2024 2024-11-01 OKC   POR          0     1           5
## 6   2024 2024-11-02 OKC   LAC          0     1           6
# Check if each game plus the 3 previous games spans 6 nights or less
okc_with_stretches <- okc_schedule %>%
  # Create lag column for date from 3 games previously
  mutate(
    prev_game_3 = lag(gamedate, 3)
  ) %>%
  filter(game_number >= 4) %>% # Start from game 4 (need 3 games previous for calculations)
  # Check if current game is 4th in 6 nights
  mutate(
    date_span = as.numeric(gamedate - prev_game_3), # Calculate span from 4th previous game to the current game
    is_4in6_stretch = date_span <= 5 # Night 6 - Night 1 = 5 Days (accounts for the fact that subtraction gives the span of dates)
  ) %>%
  filter(is_4in6_stretch)

head(okc_with_stretches)
## # A tibble: 6 × 10
##   season gamedate   team  opponent  home   win game_number prev_game_3 date_span
##    <dbl> <date>     <chr> <chr>    <dbl> <dbl>       <int> <date>          <dbl>
## 1   2024 2024-11-04 OKC   ORL          1     1           7 2024-10-30          5
## 2   2024 2024-11-06 OKC   DEN          0     0           8 2024-11-01          5
## 3   2024 2024-11-11 OKC   LAC          1     1          11 2024-11-06          5
## 4   2024 2024-11-13 OKC   NOP          1     1          12 2024-11-08          5
## 5   2024 2024-11-15 OKC   PHX          1     1          13 2024-11-10          5
## 6   2024 2024-11-20 OKC   POR          1     1          16 2024-11-15          5
## # ℹ 1 more variable: is_4in6_stretch <lgl>
cat(nrow(okc_with_stretches), '4-in-6 stretches') # There are 26 observations, showing that there were 26 stretches
## 26 4-in-6 stretches

There are 26 4-in-6 stretches in OKC’s draft schedule.

# Calculate the average number of 4-in-6 stretches for a team in a season
calc_4in6_stretches <- function(team_schedule) {
  team_schedule %>%
    arrange(gamedate) %>%
    mutate(game_number = row_number()) %>%
    # Create lag column for date from 3 games previously
    mutate(
      prev_game_3 = lag(gamedate, 3)
    ) %>%
    filter(game_number >= 4) %>%
    mutate(
      date_span = as.numeric(gamedate - prev_game_3),
      is_4in6 = date_span <= 5
    ) %>%
    # Count the stretches
    summarise(
      total_games = max(game_number), 
      stretches_4in6 = sum(is_4in6, na.rm = TRUE)
    )
}
# Apply stretch counter to all teams and season combinations
team_season <- schedule %>%
  mutate(gamedate = ymd(gamedate)) %>% # Convert to Date object
  group_by(team, season) %>% # Group by team and season
  group_modify(~ calc_4in6_stretches(.x)) %>% # Apply function to each group
  ungroup() # Remove team and season grouping

head(team_season)
## # A tibble: 6 × 4
##   team  season total_games stretches_4in6
##   <chr>  <dbl>       <int>          <int>
## 1 ATL     2014          82             32
## 2 ATL     2015          82             30
## 3 ATL     2016          82             29
## 4 ATL     2017          82             21
## 5 ATL     2018          82             21
## 6 ATL     2019          67             18
# Adjust to 82 games and calculate overall average by season
per_82 <- team_season %>%
  # Adjust to per 82
  mutate(
    stretches_82 = (stretches_4in6 / total_games) * 82 # Formula is (# of stretches found unadjusted / total games played) * 82
  ) %>%
  arrange(desc(stretches_82)) # Sort in descending order

head(per_82)
## # A tibble: 6 × 5
##   team  season total_games stretches_4in6 stretches_82
##   <chr>  <dbl>       <int>          <int>        <dbl>
## 1 WAS     2020          72             40         45.6
## 2 MEM     2020          72             39         44.4
## 3 DAL     2020          72             36         41  
## 4 SAS     2020          72             36         41  
## 5 BOS     2020          72             35         39.9
## 6 CHA     2020          72             35         39.9
# Calculate final average across all teams and seasons
overall_average <- per_82 %>%
  summarise(
    total_team_seasons = n(), # Count all rows
    mean_stretches = mean(stretches_82, na.rm = TRUE)
  )

final_average <- round(overall_average$mean_stretches, 1)

cat(final_average, '4-in-6 stretches')
## 25.1 4-in-6 stretches

There 25.1 4-in-6 stretches on average per season for each NBA team (adjusted to per-82 games).

# Calculate the average number of 4-in-6 stretches by team over 10 seasons
team_averages <- per_82 %>%
  group_by(team) %>%
  summarise(
    seasons_played = n(), # Count seasons per team
    total_games_all_seasons = sum(total_games), # Total game count across all seasons
    total_stretches = sum(stretches_4in6), # Total 4-in-6 count across all seasons
    avg_stretches = mean(stretches_82, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_stretches)) # Rank teams from most to fewest stretches for visual clarity

head(team_averages)
## # A tibble: 6 × 5
##   team  seasons_played total_games_all_seasons total_stretches avg_stretches
##   <chr>          <int>                   <int>           <int>         <dbl>
## 1 CHA               10                     793             271          28.1
## 2 CHI               10                     793             270          28.0
## 3 POR               10                     802             263          26.9
## 4 DET               10                     794             256          26.5
## 5 LAC               10                     800             258          26.5
## 6 WAS               10                     800             256          26.4
# Find teams with most and fewest stretches
most_stretches <- team_averages %>%
  slice(1) %>% # First row (highest average)
  select(team, avg_stretches)

least_stretches <- team_averages %>%
  slice(n()) %>% # Last row (lowest average)
  select(team, avg_stretches)

cat(most_stretches$team, round(most_stretches$avg_stretches, 1), "\n") 
## CHA 28.1
cat(least_stretches$team, round(least_stretches$avg_stretches, 1))
## NYK 22.2

The Charlotte Hornets had the highest average number of 4-in-6 stretches between 2014-15 and 2023-24 at 28.1
The New York Knicks had the fewest average number of 4-in-6 stretches between 2014-15 and 2023-24 at 22.2

# Calculate games where BKN is defensive team
bkn_defensive_2023 <- game_data %>%
  filter(season == 2023, def_team == 'BKN') %>% # Get all games where BKN was defending
  mutate(
    opp_efg_pct = (fgmade + 0.5 * fg3made) / fgattempted * 100 # Formula for eFG is (FG + 0.5 * 3FG) / FGA * 100
  ) %>%
  
  filter(!is.na(opp_efg_pct)) %>% # Remove games with missing shooting data
  select(gamedate, off_team, fgmade, fg3made, fgattempted, opp_efg_pct) # Keep relevant columns only

# Calculate overall defensive eFG% for BKN
overall_defensive <- bkn_defensive_2023 %>%
  summarise(
    total_games = n(),
    total_fg_made = sum(fgmade), # Total opponent FG made against BKN
    total_3fg_made = sum(fg3made), # Total opponent 3FG made against BKN  
    total_fg_att = sum(fgattempted), # Total opponent FG attempted against BKN
    defensive_efg_pct = (total_fg_made + 0.5 * total_3fg_made) / total_fg_att * 100
  )

cat(round(overall_defensive$defensive_efg_pct, 1), '%')
## 54.3 %
# Identify when teams are playing on second night of a back-to-back
back_to_back <- schedule %>%
  filter(season == 2023) %>%
  mutate(gamedate = ymd(gamedate)) %>%
  arrange(team, gamedate) %>% # Sort chronologically by team
  group_by(team) %>%
  mutate(
    prev_game_date = lag(gamedate), # Previous game date for each team
    days_since_prev = as.numeric(gamedate - prev_game_date), # Days between games
    is_2nd = days_since_prev == 1 # TRUE if playing second game of back-to-back
  ) %>%
  ungroup() %>%
  filter(is_2nd == TRUE) %>% # Keep only second games of back-to-backs
  select(team, gamedate, is_2nd)

head(back_to_back)
## # A tibble: 6 × 3
##   team  gamedate   is_2nd
##   <chr> <date>     <lgl> 
## 1 ATL   2023-10-30 TRUE  
## 2 ATL   2023-11-15 TRUE  
## 3 ATL   2023-11-22 TRUE  
## 4 ATL   2023-11-26 TRUE  
## 5 ATL   2023-12-16 TRUE  
## 6 ATL   2023-12-23 TRUE
# Join back-to-back data to BKN's defensive games
bkn_vs_b2b <- bkn_defensive_2023 %>%
  mutate(gamedate = ymd(gamedate)) %>%
  # Filter out games where BKN is on a b2b
  left_join(
    back_to_back,
    by = c('off_team' = 'team', 'gamedate' = 'gamedate')
  ) %>%
  # Fill NA values with FALSE
  mutate(is_2nd = replace_na(is_2nd, FALSE))

# Calculate defensive eFG
b2b_defensive <- bkn_vs_b2b %>%
  filter(is_2nd ==  TRUE) %>%
  summarise(
    b2b_games = n(), # Number of games vs back-to-back opponents
    total_fg_made = sum(fgmade), # Opponent FG made in B2B games
    total_3fg_made = sum(fg3made), # Opponent 3FG made in B2B games
    total_fg_att = sum(fgattempted), # Opponent FGA in B2B games  
    defensive_efg_pct_b2b = (total_fg_made + 0.5 * total_3fg_made) / total_fg_att * 100
  )

cat(round(b2b_defensive$defensive_efg_pct_b2b, 1), '%')
## 53.5 %

BKN’s defensive eFG% in the 2023-24 season was 54.3%
BKN’s defensive eFG% that season in situations where their opponent was on the second night of back-to-back was 53.5%

Part 3 – Modeling

# Load and prepare data (2019-2023)
modeling_data <- schedule %>%
  filter(season >= 2019, season <= 2023) %>%
  mutate(gamedate = ymd(gamedate)) %>%
  # Add location information for travel calculations
  left_join(locations, by = c("team" = "team")) %>%
  rename(team_lat = latitude, team_lon = longitude) %>% # Rename for mental and visual clarity
  left_join(locations, by = c("opponent" = "team")) %>%
  rename(opp_lat = latitude, opp_lon = longitude) %>% # Rename for mental and visual clarity
  # Calculate game location
  mutate(
    game_lat = if_else(home == 1, team_lat, opp_lat),
    game_lon = if_else(home == 1, team_lon, opp_lon)
  ) %>%
  arrange(team, season, gamedate)

cat("Total games:", nrow(modeling_data), "\n")
## Total games: 11658
cat("Teams:", length(unique(modeling_data$team)), "\n")
## Teams: 30
cat("Seasons:", length(unique(modeling_data$season)), "\n")
## Seasons: 5
# Calculate schedule-related factors
calculate_schedule_factors <- function(team_data) {
  team_data %>%
    arrange(gamedate) %>%
    mutate(
      game_number = row_number(),
      
      # Travel factors
      prev_game_lat = lag(game_lat),
      prev_game_lon = lag(game_lon),
      travel_distance = case_when(
        is.na(prev_game_lat) ~ 0,
        TRUE ~ round(haversine_distance(prev_game_lat, prev_game_lon, game_lat, game_lon), 0) # Round for cleaner data
      ),
      
      # Rest factors
      prev_game_date = lag(gamedate),
      rest_days = case_when(
        is.na(prev_game_date) ~ NA_real_,
        TRUE ~ as.numeric(gamedate - prev_game_date) - 1
      ),
      is_back_to_back = rest_days == 0,
      is_well_rested = rest_days >= 2,
      
      # Dense scheduling factors
      prev_game_3 = lag(gamedate, 3),
      is_4in6_end = case_when(
        game_number >= 4 ~ as.numeric(gamedate - prev_game_3) <= 5,
        TRUE ~ FALSE # Cannot end a stretch without 3 prior games
      )
    )
}

# Apply to all teams and seasons
schedule_factors <- modeling_data %>%
  group_by(team, season) %>%
  group_modify(~ calculate_schedule_factors(.x)) %>%
  ungroup()

head(schedule_factors)
## # A tibble: 6 × 24
##   team  season gamedate   opponent  home   win team_lat team_lon timezone.x
##   <chr>  <dbl> <date>     <chr>    <dbl> <dbl>    <dbl>    <dbl> <chr>     
## 1 ATL     2019 2019-10-24 DET          0     1     33.8    -84.4 Eastern   
## 2 ATL     2019 2019-10-26 ORL          1     1     33.8    -84.4 Eastern   
## 3 ATL     2019 2019-10-28 PHI          1     0     33.8    -84.4 Eastern   
## 4 ATL     2019 2019-10-29 MIA          0     0     33.8    -84.4 Eastern   
## 5 ATL     2019 2019-10-31 MIA          1     0     33.8    -84.4 Eastern   
## 6 ATL     2019 2019-11-05 SAS          1     1     33.8    -84.4 Eastern   
## # ℹ 15 more variables: opp_lat <dbl>, opp_lon <dbl>, timezone.y <chr>,
## #   game_lat <dbl>, game_lon <dbl>, game_number <int>, prev_game_lat <dbl>,
## #   prev_game_lon <dbl>, travel_distance <dbl>, prev_game_date <date>,
## #   rest_days <dbl>, is_back_to_back <lgl>, is_well_rested <lgl>,
## #   prev_game_3 <date>, is_4in6_end <lgl>
# Create schedule difficulty metrics by team/season
team_schedule_difficulty <- schedule_factors %>%
  group_by(team, season) %>%
  summarise(
    games = n(),
    
    # Travel burden
    total_travel = sum(travel_distance, na.rm = TRUE),
    avg_travel_per_game = mean(travel_distance, na.rm = TRUE),
    
    # Rest patterns
    avg_rest = mean(rest_days, na.rm = TRUE),
    back_to_back_games = sum(is_back_to_back, na.rm = TRUE),
    well_rested_games = sum(is_well_rested, na.rm = TRUE),
    
    # Dense periods
    four_in_six_endings = sum(is_4in6_end, na.rm = TRUE),
    
    # Home/away
    home_games = sum(home),
    
    # Actual performance
    actual_wins = sum(win),
    win_pct = actual_wins / games,
    .groups = "drop"
  )

# Calculate each metric relative to league average for that season
schedule_relative <- team_schedule_difficulty %>%
  group_by(season) %>%
  mutate(
    travel_vs_avg = avg_travel_per_game - mean(avg_travel_per_game),
    rest_vs_avg = avg_rest - mean(avg_rest),
    b2b_vs_avg = back_to_back_games - mean(back_to_back_games),
    four_in_six_vs_avg = four_in_six_endings - mean(four_in_six_endings)
  ) %>%
  ungroup()

head(schedule_relative)
## # A tibble: 6 × 16
##   team  season games total_travel avg_travel_per_game avg_rest
##   <chr>  <dbl> <int>        <dbl>               <dbl>    <dbl>
## 1 ATL     2019    67        36454                544.     1.11
## 2 ATL     2020    72        31179                433.     1.03
## 3 ATL     2021    82        40829                498.     1.11
## 4 ATL     2022    82        37732                460.     1.12
## 5 ATL     2023    82        39247                479.     1.12
## 6 BKN     2019    72        41106                571.     3.15
## # ℹ 10 more variables: back_to_back_games <int>, well_rested_games <int>,
## #   four_in_six_endings <int>, home_games <dbl>, actual_wins <dbl>,
## #   win_pct <dbl>, travel_vs_avg <dbl>, rest_vs_avg <dbl>, b2b_vs_avg <dbl>,
## #   four_in_six_vs_avg <dbl>
# Create composite schedule difficulty index
schedule_with_index <- schedule_relative %>%
  mutate(
    # Create weighted schedule difficulty index (negative = harder schedule)
    # Weights are equal for robustness as I found that skewing the weights in any direction-
    # did not change the outcomes I received for the most helped or hurt
    schedule_difficulty_index = 
      -travel_vs_avg * 0.25 +        # More travel = harder (negative weight)
      rest_vs_avg * 0.25 -           # More rest = easier (positive weight)  
      b2b_vs_avg * 0.25 -           # More b2bs = harder (negative weight)
      four_in_six_vs_avg * 0.25,    # More 4-in-6 = harder (negative weight)
    
    # Normalize to make interpretation easier
    schedule_difficulty_scaled = scale(schedule_difficulty_index)[,1]
  )

# Get expected wins based on historical team strength (simple approach)
# Use team's win percentage from previous seasons as baseline expectation
expected_wins <- schedule_with_index %>%
  arrange(team, season) %>%
  group_by(team) %>%
  mutate(
    # Use previous season's win rate as expectation
    prev_win_pct = lag(win_pct),
    expected_wins = case_when(
      is.na(prev_win_pct) ~ games * 0.5,  # League average for first season
      TRUE ~ games * prev_win_pct
    ),
    wins_above_expected = actual_wins - expected_wins
  ) %>%
  ungroup()

# Correlate schedule difficulty with performance vs expectations
correlation <- cor(expected_wins$schedule_difficulty_scaled, 
                   expected_wins$wins_above_expected, 
                   use = "complete.obs")
cat("Correlation between schedule difficulty and wins above expected:", round(correlation, 3), "\n")
## Correlation between schedule difficulty and wins above expected: -0.077
# Show teams with the most wins gained/lost schedules
easiest <- expected_wins %>%
  group_by(team) %>%
  summarise(avg_difficulty = mean(schedule_difficulty_scaled), .groups = "drop") %>%
  arrange(desc(avg_difficulty))
head(easiest)
## # A tibble: 6 × 2
##   team  avg_difficulty
##   <chr>          <dbl>
## 1 CLE             1.81
## 2 IND             1.29
## 3 DET             1.18
## 4 WAS             1.14
## 5 CHA             1.12
## 6 TOR             1.01
hardest <- expected_wins %>%
  group_by(team) %>%
  summarise(avg_difficulty = mean(schedule_difficulty_scaled), .groups = "drop") %>%
  arrange(avg_difficulty)
head(hardest)
## # A tibble: 6 × 2
##   team  avg_difficulty
##   <chr>          <dbl>
## 1 POR           -1.58 
## 2 SAC           -1.37 
## 3 GSW           -1.22 
## 4 LAC           -0.844
## 5 MIA           -0.816
## 6 NOP           -0.796

The Cleveland Cavaliers gained 1.8 regular season wins due to schedule-related factors from 2019-20 though 2023-24 (+1.8 wins)
The Portland Trailblazers lost 1.6 regular season wins due to schedule-related factors from 2019-20 though 2023-24 (-1.6 wins)